import re
import requests
from pyquery import PyQuery as pq
from sql_helper import MySqlHelper
import json
import time
# Desktop-Chrome User-Agent sent with every request so zhipin.com serves the
# normal HTML pages. (Fixed: the platform token previously read
# "Windows NT 6.1.txt" — a stray ".txt" corrupted the string.)
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
headers = {
    'User-Agent': USER_AGENT,
}



def read_jd_list(url, item_dict_all=None):
    """Scrape one zhipin.com job-list page and append one record per job.

    For every job card on the list page, the job's detail page is fetched
    to pick up the street address, welfare tags and the job requirements.

    Args:
        url: Job-list page URL (e.g. ``https://www.zhipin.com/c...-p...``).
        item_dict_all: Optional list to extend; a fresh list is created when
            ``None`` (avoids the mutable-default-argument pitfall).

    Returns:
        list: ``item_dict_all`` with one 9-element list appended per job:
        [post, company, address, district, salary, condition,
         experience, education, welfare].
    """
    if item_dict_all is None:
        item_dict_all = []

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Early return on failure — same message as before, flatter control flow.
        print("你抓的出问题了！")
        return item_dict_all

    # Compiled once per page instead of once per job card.
    # The <p> under info-primary looks like "district<em/>experience<em/>education".
    re_district = re.compile('<p>(.*?)<em', re.S)
    re_experience = re.compile('/>(.*?)<em', re.S)
    re_education = re.compile('<em.*?/>(.*?)</p>', re.S)

    html = pq(response.text)
    titles = html('#main > div > div.job-list > ul > li > div > div.info-primary > h3 > a > div.job-title').items()
    rmbs = html('#main > div > div.job-list > ul > li > div > div.info-primary > h3 > a > span').items()
    gongsi = html('#main > div > div.job-list > ul > li > div > div.info-company > div > h3 > a').items()
    diqu = html('#main > div > div.job-list > ul > li > div > div.info-primary > p').items()
    contents = html('#main > div > div.job-list > ul > li > div > div.info-primary > h3 > a').items()

    for title, rmb, gs, dq, content_url in zip(titles, rmbs, gongsi, diqu, contents):
        dq_text = str(dq)
        district = re_district.findall(dq_text)
        experience = re_experience.findall(dq_text)
        education = re_education.findall(dq_text)
        # Guard: a malformed card used to raise IndexError below — skip it instead.
        if not (district and experience and education):
            continue

        detail = requests.get('https://www.zhipin.com' + content_url.attr('href'),
                              headers=headers)
        detail_html = pq(detail.text)
        address = detail_html(
            '#main > div.job-box > div > div.job-detail > div div.location-address').text()
        welfare = detail_html(
            '#main > div.job-box > div > div.job-detail > div.detail-content > div:nth-child(2) > div.job-tags').text()
        condition = detail_html(
            '#main > div.job-box > div > div.job-detail > div.detail-content > div:nth-child(1) > div').text()

        # 岗位, 公司名称, 地址, 城市, 工资, 任职条件, 经验, 学历, 福利
        item_dict_all.append([
            title.text(),           # post
            gs.text(),              # company
            address,                # street address (from the detail page)
            district[0].strip(),    # district / city area
            rmb.text(),             # salary range
            condition,              # job requirements
            experience[0].strip(),  # required experience
            education[0][-2:],      # education — last two chars, e.g. "本科"
            welfare,                # welfare tags
        ])

        print('正在爬取：', district[0].strip())
        # Polite crawl delay between detail requests (replaces the old
        # 2-iteration sleep loop with its unused counter and debug prints).
        time.sleep(2)

    return item_dict_all


def code():
    """Fetch zhipin.com's city catalogue and return the target city codes.

    Downloads the public ``city.json`` and collects the code of each
    province's first sub-level city when its name is one of the 12
    hard-coded target cities.

    Returns:
        list: City codes usable in zhipin.com list URLs (``/c<code>-...``).
    """
    # Hoisted out of the loop — previously this list was rebuilt per province.
    target_cities = {'北京', '上海', '广州', '杭州', '深圳', '武汉',
                     '重庆', '成都', '郑州', '西安', '济南', '长沙'}

    response = requests.get('https://www.zhipin.com/common/data/city.json')
    data = json.loads(response.text)

    city_list = []
    for province in data['data']['cityList']:
        sub_cities = province['subLevelModelList']
        if not sub_cities:
            # Guard: an empty sub-level list used to raise IndexError.
            continue
        # NOTE: mirrors the original behavior — only the FIRST sub-level
        # city of each province is examined.
        first_city = sub_cities[0]
        if first_city['name'] in target_cities:
            city_list.append(first_city['code'])

    return city_list



if __name__ == "__main__":
    # Resolve the target city codes once, then scrape each city's list page.
    city_codes = code()

    # One DB helper reused across all cities (was re-created per iteration).
    mysql = MySqlHelper()
    # 岗位, 公司名称, 地址, 城市, 工资, 任职条件, 经验, 学历, 福利
    sql = "INSERT INTO boos(`post`, `company`, `site`, `city`, `salary`, `condition`, `experience`, `education`, `welfare`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"

    for city_code in city_codes:
        url = "https://www.zhipin.com/c" + str(city_code) + "-p100109"
        rows = read_jd_list(url)
        print(rows)
        mysql.exec_many(sql, rows)
        # Throttle between cities (replaces the old 60-iteration
        # print/sleep loop and its unused counters).
        time.sleep(60)